{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "president_NLP.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python2",
      "display_name": "Python 2"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/AshwinDeshpande96/Speech-Generation/blob/master/president_NLP.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "psXhMobHtg5z",
        "colab_type": "text"
      },
      "source": [
        "# Import Library and Data"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "HJuPGQtSjp5l",
        "colab_type": "code",
        "outputId": "ce8a5990-9d7c-4a82-c85f-9456860b4062",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "import numpy\n",
        "from keras.models import Model, load_model, Sequential\n",
        "from keras.layers import Dense, Dropout, LSTM, Input\n",
        "from keras.utils import np_utils\n",
        "from google.colab import drive\n",
        "from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau\n",
        "from keras.optimizers import Adam\n",
        "import sys\n",
        "import inflect\n",
        "import re\n",
        "import nltk\n",
        "from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer\n",
        "import spacy\n",
        "import codecs\n",
        "import pandas as pd"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Using TensorFlow backend.\n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MNko6g70t7Z0",
        "colab_type": "text"
      },
      "source": [
        "The WordNet corpus is required by the lemmatizer, which derives root words from their inflected forms."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NG5snBTtxKVG",
        "colab_type": "code",
        "outputId": "183f366e-7ce5-43a9-dfa4-6b3bff4caba2",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 68
        }
      },
      "source": [
        "# Fetch the WordNet corpus; the WordNetLemmatizer used later looks words up in it.\n",
        "nltk.download(\"wordnet\")"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
            "[nltk_data]   Unzipping corpora/wordnet.zip.\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 3
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4T7qyIZeuN7O",
        "colab_type": "text"
      },
      "source": [
        "I have my files stored on my Google Drive. The Presidential Speech Corpus is imported from this link.\n",
        "\n",
        "You can upload the data either to your Google Drive or directly to Google Colaboratory (this implementation runs on Google Colab because it takes care of the dependencies).\n",
        "Use the following code to upload the file directly.\n",
        "\n",
        "\n",
        "```\n",
        "from google.colab import files\n",
        "uploaded = files.upload()\n",
        "```\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "x243ce7Zj_is",
        "colab_type": "code",
        "outputId": "6bc51c5a-4004-4d3d-9904-312fcc95e506",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 122
        }
      },
      "source": [
        "# Mount Google Drive at /content/gdrive so the corpus and weight files are reachable.\n",
        "# Alternative call that forces a remount:\n",
        "#drive.mount('/content/gdrive', force_remount=True)\n",
        "drive.mount('/content/gdrive')"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code\n",
            "\n",
            "Enter your authorization code:\n",
            "··········\n",
            "Mounted at /content/gdrive\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bCWl4xQ1vg28",
        "colab_type": "text"
      },
      "source": [
        "Replace `file_path` with the location where you saved your file."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "90OaffXpkcPx",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "president = 'lbjohnson'\n",
        "file_path = '/content/gdrive/My Drive/Projects/NLP/President Speech/' +president+'_all.txt'"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lp1_ZrlkvouS",
        "colab_type": "text"
      },
      "source": [
        "You could open the file with the built-in function: `raw_text = open(file_path).read()`. \n",
        "\n",
        "However, the spaCy library used during pruning requires unicode text, and the built-in `open()` returns a `str` (under Python 2). \n",
        "\n",
        "For this reason the `codecs` library is used."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "P-HfTCgojyJm",
        "colab_type": "code",
        "outputId": "aedcd4a9-0e94-4ef9-caf8-4fa15bc32004",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "# codecs.open decodes the file to unicode text (the type spaCy needs on Python 2);\n",
        "# the with-block closes the file handle as soon as the text has been read.\n",
        "with codecs.open(file_path, encoding='utf-8') as speech_file:\n",
        "    raw_text = speech_file.read()\n",
        "print(type(raw_text))"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "<type 'unicode'>\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7clkDFfSttBa",
        "colab_type": "text"
      },
      "source": [
        "# Data Pre-Processing and Pruning\n",
        "This part takes care of tokenizing words and creating word vector and vocabulary vector."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "KwQA3RHWwsWM",
        "colab_type": "text"
      },
      "source": [
        "tokenize function as the name specifies will return a list of tokenized words from raw_text: \n",
        "\n",
        "For example: \n",
        "raw_text =  \"Today's weather condition is cloudy with a 76% of rain. Temperature may remain \n",
        "    cool at 21°C with Humidity 61%.\"\n",
        "    \n",
        " raw_words = ['today', ' ', 'weath', ' ', 'condit', ' ', 'is', ' ', 'cloudy', ' ', 'with', ' ', 'a', ' ', 'seventy', ' ', 'six', ' ', 'of', ' ', 'rain', ' ', '.', ' ', 'temp', ' ', 'may', ' ', 'remain', ' ', 'cool', ' ', 'at', ' ', 'twenty', ' ', 'on', ' ', 'c', ' ', 'with', ' ', 'humid', ' ', 'sixt', ' ', 'on', ' ', '.']\n",
        " \n",
        " Do not run this cell twice: the function overwrites its input variable, so a second execution would operate on the already altered text. Make sure to run the `codecs.open()` cell before the tokenize function.\n",
        "    "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "LBLmMw1Kxq9w",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# NLTK stemmers and lemmatizer. text_preprocess() uses `lancaster` and\n",
        "# `wordnet_lemmatizer`; `porter` is created but not referenced elsewhere in this notebook.\n",
        "porter = PorterStemmer()\n",
        "lancaster=LancasterStemmer()\n",
        "wordnet_lemmatizer = WordNetLemmatizer()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3Y-InccOo8TB",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def tokenize(raw_text):\n",
        "    \"\"\"Normalise the raw corpus text and split it into spaCy tokens.\n",
        "\n",
        "    Steps, in order: spell out every run of digits as English words, replace\n",
        "    the listed punctuation characters (and newlines) with spaces, isolate\n",
        "    full stops as separate tokens, drop apostrophes, spell out '%' as the\n",
        "    word 'percent', then tokenize with spaCy's 'en_core_web_sm' pipeline.\n",
        "\n",
        "    Args:\n",
        "        raw_text: unicode text of the corpus.\n",
        "\n",
        "    Returns:\n",
        "        (raw_text, raw_words): the normalised text and the list of token\n",
        "        strings (token.text) in document order.\n",
        "    \"\"\"\n",
        "    engine = inflect.engine()\n",
        "    # Substitute every digit run at the place where it occurs. Replacing number by\n",
        "    # number with str.replace() also rewrote digits inside longer numbers (a lone\n",
        "    # '0' and '2' turned '02' into 'zerotwo'), so later lookups no longer matched.\n",
        "    raw_text = re.sub(r'\\d+', lambda match: str(engine.number_to_words(match.group(0))), raw_text)\n",
        "    punctuation_chars = [\"!\", '\"', '&', ',', '?', '/', ':', ';', '<', '>', '$', '#', '@', '*', '(', ')', '[', ']', '{', '}', '\\n', '-', '`']\n",
        "    for symbol in punctuation_chars:\n",
        "        raw_text = raw_text.replace(symbol, ' ')\n",
        "    raw_text = raw_text.replace('.', ' . ')\n",
        "    raw_text = raw_text.replace(\"'\", \"\")\n",
        "    # Pad with spaces so '76%' ends in '... six percent' rather than '... sixpercent'.\n",
        "    raw_text = raw_text.replace(\"%\", \" percent \")\n",
        "\n",
        "    spacy_nlp = spacy.load('en_core_web_sm', disable=['ner', 'textcat'])\n",
        "    # The text is parsed in two pieces (presumably to stay below spaCy's input\n",
        "    # length limit; TODO confirm). Cut at the first space at or after the midpoint\n",
        "    # so that no word is split across the pieces; // keeps the index an integer.\n",
        "    split_at = raw_text.find(' ', len(raw_text) // 2)\n",
        "    if split_at == -1:\n",
        "        split_at = len(raw_text)\n",
        "    raw_words = [token.text for token in spacy_nlp(raw_text[:split_at])]\n",
        "    raw_words += [token.text for token in spacy_nlp(raw_text[split_at:])]\n",
        "    return raw_text, raw_words\n",
        "raw_text, raw_words = tokenize(raw_text)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OnZq2rMW0-Wi",
        "colab_type": "text"
      },
      "source": [
        "Further lemmatization and removal of extra spaces are done in `text_preprocess()`."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3KYX_NJ7SQ6i",
        "colab_type": "code",
        "outputId": "ae02f60a-7f09-464a-b3cd-7bab92665cbb",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "# Alias to the token list produced by tokenize(); it stays valid because the call\n",
        "# at the bottom rebinds raw_words to a new array instead of modifying the list.\n",
        "word_copy = raw_words\n",
        "def text_preprocess(raw_words):\n",
        "    \"\"\"Reduce tokens to root forms and build the vocabulary.\n",
        "\n",
        "    Tokens containing a space are skipped. Every other token is stemmed with\n",
        "    the Lancaster stemmer and the stem is then passed to the WordNet lemmatizer.\n",
        "\n",
        "    Args:\n",
        "        raw_words: iterable of token strings.\n",
        "\n",
        "    Returns:\n",
        "        Tuple (n_vocab, n_words, raw_dict, raw_words, vocab): raw_words is a\n",
        "        numpy array of root forms in text order, vocab the sorted array of\n",
        "        unique roots, n_vocab and n_words their lengths, and raw_dict maps\n",
        "        each root to the last original token that produced it.\n",
        "    \"\"\"\n",
        "    stripped = []\n",
        "    raw_dict = {}\n",
        "    for val in raw_words:\n",
        "        if \" \" in val:\n",
        "            continue\n",
        "        root = str(wordnet_lemmatizer.lemmatize(lancaster.stem(val)))\n",
        "        stripped.append(root)\n",
        "        raw_dict[root] = str(val)\n",
        "    raw_words = numpy.array(stripped)\n",
        "    print(\"Text Word Count:  %d\" % raw_words.shape[0])\n",
        "    # numpy.unique already returns the unique values sorted, so no separate sort is needed.\n",
        "    vocab = numpy.unique(raw_words)\n",
        "    print(\"Vocab Length:  %d\" % vocab.shape[0])\n",
        "    return vocab.shape[0], raw_words.shape[0], raw_dict, raw_words, vocab\n",
        "\n",
        "n_vocab, n_words, raw_dict, raw_words, vocab = text_preprocess(raw_words)"
      ],
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Text Word Count:  267474\n",
            "Vocab Length:  5161\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "GV6ZXA8m1Vw1",
        "colab_type": "text"
      },
      "source": [
        "Raw text processing is costly, and we do not want to execute this cell for every experiment. Hence we save a copy of these processed words for repeated later use."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "7tZfwdi0T4Cx",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Persist the processed words as a single-column CSV (pandas also writes its index column).\n",
        "df = pd.DataFrame({\"raw_words\": raw_words})\n",
        "df.to_csv('/content/gdrive/My Drive/Projects/NLP/President Speech/words.csv')"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "uM6XyFzl1k7_",
        "colab_type": "text"
      },
      "source": [
        "A sample of the vocabulary is printed below. Uncomment the second line to see the processed words, which are in the same order as in the raw text."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "uRqptu503Vnp",
        "colab_type": "code",
        "outputId": "485c0a40-222e-4e05-a152-03c2d27e0f33",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 170
        }
      },
      "source": [
        "# Show the first 50 and the last 10 vocabulary entries.\n",
        "print \"Vocab: \",vocab[:50], \"\\n...\\n\", vocab[-10:]\n",
        "#print \"Raw Words: \",raw_words[:50], \"\\n...\\n\", raw_words[-10:]\n"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Vocab:  ['.' 'a' 'ab' 'abandon' 'abbrevy' 'abc' 'abel' 'abet' 'abh' 'abhor' 'abid'\n",
            " 'abl' 'ablest' 'aboard' 'abol' 'abolit' 'about' 'abov' 'abraham' 'abram'\n",
            " 'abroad' 'abrupt' 'absolv' 'absorb' 'abstract' 'abund' 'abus' 'academy'\n",
            " 'acc' 'acceiv' 'accel' 'accentu' 'access' 'accid' 'accommod' 'accompany'\n",
            " 'accompl' 'accord' 'account' 'accredit' 'accru' 'accum' 'accus'\n",
            " 'accustom' 'ach' 'achiev' 'ackley' 'acknowledg' 'acquaint' 'acquiesc'] \n",
            "...\n",
            "['yourselv' 'yugoslav' 'zagor' 'zeal' 'zealand' 'zero' 'zerotwo' 'zip'\n",
            " 'zon' 'zoot']\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WIirT-0f4Dy_",
        "colab_type": "text"
      },
      "source": [
        "The following code cells convert the text data to numeric vectors that our LSTM model can use to find patterns."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "pE2KHJX9lEMv",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Lookup table from each vocabulary word to its position in `vocab`.\n",
        "vocab_to_int = {word: position for position, word in enumerate(vocab)}"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "18M8wcT5lbFv",
        "colab_type": "code",
        "outputId": "367018bc-006a-4890-db0c-feef4fbdf083",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "# Prepare the dataset of input/output pairs encoded as integers: each input is a\n",
        "# window of seq_length consecutive words and the target is the word that follows it.\n",
        "seq_length = 100\n",
        "# Encode the corpus once; every window is then a plain list slice instead of\n",
        "# seq_length dictionary lookups per window.\n",
        "encoded_words = [vocab_to_int[word] for word in raw_words]\n",
        "dataX = [encoded_words[offset:offset + seq_length] for offset in range(n_words - seq_length)]\n",
        "dataY = encoded_words[seq_length:n_words]\n",
        "n_patterns = len(dataX)\n",
        "print(\"Total Patterns:  %d\" % n_patterns)"
      ],
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Total Patterns:  267374\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MVBXfursnDoo",
        "colab_type": "code",
        "outputId": "805809d4-4c2e-4a63-dbb6-7eb22289c600",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        }
      },
      "source": [
        "# Sanity check: one row per pattern, seq_length word ids per row.\n",
        "print numpy.array(dataX).shape"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "(267374, 100)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "LKefWPepr3gJ",
        "colab_type": "code",
        "outputId": "e65a5b4f-f6df-4d51-a422-0826ad3297d7",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "# Lay the windows out as [samples, time steps, features] and divide the\n",
        "# word ids by the vocabulary size.\n",
        "X = numpy.asarray(dataX).reshape(n_patterns, seq_length, 1) / float(n_vocab)\n",
        "print(X.shape)"
      ],
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "(267374, 100, 1)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "B-UH_UqZPVGg",
        "colab_type": "code",
        "outputId": "af6f69e6-cf79-40b9-ac7d-7bc1bf6029c0",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "# One-hot encode the targets. num_classes pins the width to the vocabulary size;\n",
        "# without it to_categorical infers the width from the largest id present in dataY.\n",
        "y = np_utils.to_categorical(dataY, num_classes=n_vocab)\n",
        "print(y.shape)"
      ],
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "(267374, 5161)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "057m5M1_-FxY",
        "colab_type": "text"
      },
      "source": [
        "# Training"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4rD5P-yj6dDi",
        "colab_type": "text"
      },
      "source": [
        "There are two different models you can choose from. The first one is deeper and takes longer to train. You can use either, since overfitting is not a concern for this model: it is designed to predict values very similar to the training data, so generalization is not required."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Bk_9ueRVsvyd",
        "colab_type": "code",
        "outputId": "df777cfa-8dea-4237-b107-44d48f030903",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 408
        }
      },
      "source": [
        "# Deeper variant: three stacked LSTM layers, a dense layer, and a softmax over the vocabulary.\n",
        "model = Sequential([\n",
        "    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),\n",
        "    Dropout(0.2),\n",
        "    LSTM(512, return_sequences=True),\n",
        "    Dropout(0.2),\n",
        "    LSTM(256),\n",
        "    Dense(256),\n",
        "    Dropout(0.2),\n",
        "    Dense(y.shape[1], activation='softmax'),\n",
        "])\n",
        "model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1.0e-3))\n",
        "\n",
        "# Lighter variant with a single LSTM layer; uncomment to use it instead:\n",
        "# model = Sequential()\n",
        "# model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))\n",
        "# model.add(Dropout(0.2))\n",
        "# model.add(Dense(y.shape[1], activation='softmax'))\n",
        "# model.compile(loss='categorical_crossentropy', optimizer='adam')\n",
        "model.summary()"
      ],
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "_________________________________________________________________\n",
            "Layer (type)                 Output Shape              Param #   \n",
            "=================================================================\n",
            "lstm_4 (LSTM)                (None, 100, 256)          264192    \n",
            "_________________________________________________________________\n",
            "dropout_4 (Dropout)          (None, 100, 256)          0         \n",
            "_________________________________________________________________\n",
            "lstm_5 (LSTM)                (None, 100, 512)          1574912   \n",
            "_________________________________________________________________\n",
            "dropout_5 (Dropout)          (None, 100, 512)          0         \n",
            "_________________________________________________________________\n",
            "lstm_6 (LSTM)                (None, 256)               787456    \n",
            "_________________________________________________________________\n",
            "dense_3 (Dense)              (None, 256)               65792     \n",
            "_________________________________________________________________\n",
            "dropout_6 (Dropout)          (None, 256)               0         \n",
            "_________________________________________________________________\n",
            "dense_4 (Dense)              (None, 5161)              1326377   \n",
            "=================================================================\n",
            "Total params: 4,018,729\n",
            "Trainable params: 4,018,729\n",
            "Non-trainable params: 0\n",
            "_________________________________________________________________\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Pj1Hq7eW97uw",
        "colab_type": "text"
      },
      "source": [
        "Start Training"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "EhtW_JRgt8l_",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Checkpoint file (lowest training loss so far) and full-model file share one folder.\n",
        "weights_dir = \"/content/gdrive/My Drive/Projects/NLP/President Speech/weights/\"\n",
        "filepath = weights_dir + president + \"_weights.hdf5\"\n",
        "model_path = weights_dir + president + '_model.h5'\n",
        "\n",
        "callbacks = [\n",
        "    EarlyStopping(monitor='loss', patience=10, verbose=0, mode='min'),\n",
        "    ReduceLROnPlateau(monitor='loss', factor=0.1, patience=5, verbose=1, mode='min'),\n",
        "    ModelCheckpoint(filepath, save_best_only=True, save_weights_only=False, monitor='loss', mode='min'),\n",
        "]\n",
        "# One initial epoch, then save so the loop below always finds a model file to load.\n",
        "model.fit(X, y, epochs=1, batch_size=128, callbacks=callbacks)\n",
        "model.save(model_path)\n",
        "# 25 rounds of 10 epochs; the model is read from disk before each round and written back after it.\n",
        "for _ in range(25):\n",
        "    model = load_model(model_path)\n",
        "    model.fit(X, y, epochs=10, batch_size=128, callbacks=callbacks)\n",
        "    model.save(model_path)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fAFtJOjH-DWO",
        "colab_type": "text"
      },
      "source": [
        "# Testing  "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nfciOJS7QMot",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Load the model file that the training cell saves (weights/PRESIDENT_model.h5);\n",
        "# the previous path pointed at a file that no cell in this notebook writes.\n",
        "model = load_model('/content/gdrive/My Drive/Projects/NLP/President Speech/weights/'+president+'_model.h5')\n",
        "# Inverse of vocab_to_int: index back to vocabulary word.\n",
        "int_to_vocab = dict(enumerate(vocab))"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vbSfmaaSRBUo",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def wrap_words(words, per_line=25):\n",
        "    \"\"\"Join words with single spaces, starting a new line after every per_line words.\"\"\"\n",
        "    words = list(words)\n",
        "    lines = [' '.join(words[begin:begin + per_line]) for begin in range(0, len(words), per_line)]\n",
        "    return '\\n'.join(lines)\n",
        "\n",
        "# randint's upper bound is exclusive, so len(dataX) makes every pattern eligible.\n",
        "start = numpy.random.randint(0, len(dataX))\n",
        "# Work on a copy: the window is extended below and dataX[start] must keep its length.\n",
        "pattern = list(dataX[start])\n",
        "print(\"Seed:\")\n",
        "print(wrap_words(raw_dict[int_to_vocab[value]] for value in pattern))\n",
        "result = []\n",
        "# Generate words: feed the window, take the most likely next word, slide the window by one.\n",
        "for _ in range(100):\n",
        "    x = numpy.reshape(pattern, (1, len(pattern), 1)) / float(n_vocab)\n",
        "    prediction = model.predict(x, verbose=0)\n",
        "    index = int(numpy.argmax(prediction))\n",
        "    result.append(int_to_vocab[index])\n",
        "    pattern = pattern[1:] + [index]\n",
        "print(\"---\")\n",
        "print(wrap_words(raw_dict[word] for word in result))\n",
        "print(\"Done.\")"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}